*** Session setup: reset Stata so the results below do not depend on whatever was left over from earlier work
*** Clear data and other stored objects from memory
clear all
*** Close any log that is still open (capture suppresses the error raised when no log is open)
capture log close
*** Remove all programs currently defined in memory
program drop _all
*** Do not pause long output with --more-- prompts
set more off
*** Delete all stored snapshots
snapshot erase _all
*** Point the PLUS ado directory at a hard-coded, machine-specific location
sysdir set PLUS "M:\Ado\Plus"
*** Use the graph scheme named graphscheme (not defined in this file; presumably installed under the PLUS directory - confirm)
set scheme graphscheme
*** Use Calibri as the font face in the graph window
graph set window fontface "Calibri"

********************************************************************************************
********************************************************************************************
********************************************************************************************
*** File name: 		Produce Statistics and Figures for Blog Post.do
*** Last updated: 	2/6/2016
***
*** This file reads in the raw and pareto-adjusted ginis, then
***		1. Calculates miscellaneous statistics for blog post
***		2. Produces figures for blog post
***		3. Produces figures for technical appendix 
********************************************************************************************
********************************************************************************************
********************************************************************************************

*** Move to the project root so that the relative paths used below (Input Data, Output Data, Graphs) resolve.
*** The global macro named directory is not defined in the visible part of this file; it has to be set before this do-file is run.
cd "$directory"

********************************************************************************************
********************************************************************************************
********************************************************************************************
*** 1. Calculate miscellaneous statistics for blog post
***		a. Read in the data
*** 	b. Descriptive stats on sample size
***		c. Ginis before and after adjustment
***		d. Top decile shares after the adjustment
********************************************************************************************
********************************************************************************************
********************************************************************************************

*****************************************************************
*****************************************************************
*** a. Read in the data
*****************************************************************
*****************************************************************
use "Output Data/Raw and Pareto-Adjusted Ginis.dta", clear

*****************************************************************
*****************************************************************
*** b. Descriptive stats on sample size
*****************************************************************
*****************************************************************

**************************************
*** How many surveys have a pareto-adjusted gini
**************************************
count if !missing(gini_adj)

**************************************
*** How many distinct countries those surveys cover
*** (done on a throwaway copy: reduce to one row per country, then count the rows)
**************************************
preserve
	drop if missing(gini_adj)
	contract countryname
	count
restore

*****************************************************************
*****************************************************************
*** c. Ginis before and after adjustment
*****************************************************************
*****************************************************************

**************************************
*** Calculate mean raw gini, mean pareto-adjusted gini, and mean change in gini
**************************************

*** Calculate change in gini
generate gini_diff = gini_adj - gini

*** Means of the raw gini, the pareto-adjusted gini, and the change between them,
*** restricted to surveys whose value of updated is 1 or 3
*** (the label listing printed first shows what those codes stand for)
label list updated
tabstat gini gini_adj gini_diff if updated == 1 | updated == 3, format(%10.2fc) columns(statistics)

**************************************
*** Mean percent of the adjusted population captured by the original survey (same restriction)
**************************************
tabstat survey_pct if updated == 1 | updated == 3, format(%10.3fc) columns(statistics)

**************************************
*** Raw and adjusted gini in the latest survey year of a few selected countries
**************************************
bysort countryname: egen max_year = max(year)
format gini* %4.3fc
list countryname gini gini_adj if year == max_year & inlist(countryname, "China", "India", "United States"), abbreviate(20)
drop max_year

**************************************
*** Top ten most unequal countries, before and after the adjustment
**************************************

sort countryname year surveytype

*** Discard surveys without an adjusted gini (no national accounts data, or alpha does not exist)
keep if gini_adj != .

*** When a country-year has two surveys, discard the income ("I") survey so the consumption one is used
by countryname year: keep if _N != 2 | surveytype != "I"

*** Retain only the last (most recent) remaining survey of each country
by countryname: drop if _n < _N

*** Require that survey to be from 2005 or later, then report how many countries are left
keep if year >= 2005
codebook countryname

*** Ten highest raw ginis
gsort -gini
list countryname gini gini_adj if _n <= 10, abbreviate(20)

*** Ten highest adjusted ginis
gsort -gini_adj
list countryname gini gini_adj if _n <= 10, abbreviate(20)


**************************************
*** Regional averages before and after the adjustment
**************************************

*** Merge in 2014 population for population weights
*** Build a one-row-per-country file holding 2014 population, to be used as the weight.
*** Only rows with urbrur == 3 are kept; the label listing printed below shows what that code stands for.
preserve
use "Input Data/Population Data.dta", clear
label list urbrur
keep if urbrur == 3 & year == 2014
rename pop_urbrur pop2014
keep countryname year pop2014
*** Save under the exact tempfile name. Appending ".dta" to it creates a different file,
*** which Stata does not erase automatically when the do-file ends.
tempfile pop2014
save `pop2014', replace
restore

*** Attach the weight. assert(2 3) stops the run if any country in memory has no population row;
*** keep(3) then retains matched countries only.
merge 1:1 countryname using `pop2014', assert(2 3) keep(3)

*** Get regional averages of the raw and adjusted gini, plus the number of countries behind each average.
*** No weight type is named, so collapse applies its default weight type to pop2014.
collapse (mean) gini gini_adj (count)n=gini [weight=pop2014], by(region)
format gini* %10.3fc

*** Show regional averages sorted by original gini, then by adjusted gini (highest first)
gsort -gini
list region n gini*, sep(100)
gsort -gini_adj
list region n gini*, sep(100)

*****************************************************************
*****************************************************************
*** d. Top decile shares after the adjustment
*****************************************************************
*****************************************************************

**************************************
*** Read in the p's and l's data
**************************************
use "Output Data/Raw and Pareto-Adjusted P's and L's.dta", clear

**************************************
*** Top decile shares, and the percent of surveys with a top decile share under 50%
**************************************

*** Within each survey, locate the adjusted p value lying nearest to 0.9
generate dist_p90 = abs(.9-p_adj)
bysort countryname year surveytype: egen min_dist_p90 = min(dist_p90)

*** Top decile income share = 1 minus the adjusted l at that point; all other rows are discarded,
*** which must leave exactly one row per survey (isid stops the run otherwise)
generate top_decile_l = 1-l_adj if dist_p90 == min_dist_p90
keep if top_decile_l != .
isid countryname year surveytype

*** Flag surveys whose top decile share is under 50% and tabulate the flag
generate under50 = top_decile_l < .5
tabulate under50, missing

********************************************************************************************
********************************************************************************************
********************************************************************************************
*** 2. Produce figures for blog post
***		a. Figure 1: Ginis Before & After Adjustment for Missing Top Incomes
***		b. Figure 2: Average Gini Across Countries, Adjusted for Missing Top Incomes 
********************************************************************************************
********************************************************************************************
********************************************************************************************

*****************************************************************
*****************************************************************
*** a. Figure 1: Ginis Before & After Adjustment for Missing Top Incomes
*****************************************************************
*****************************************************************

**************************************
*** Read in the data
**************************************
use "Output Data/Raw and Pareto-Adjusted Ginis.dta", clear

**************************************
*** Reduce to one recent survey per country
**************************************

sort countryname year surveytype

*** Discard surveys without an adjusted gini (no national accounts data, or alpha does not exist)
keep if gini_adj != .

*** When a country-year has two surveys, discard the income ("I") survey so the consumption one is used
by countryname year: keep if _N != 2 | surveytype != "I"

*** Retain only the last (most recent) remaining survey of each country
by countryname: drop if _n < _N

*** Require that survey to be from 2005 or later, then report how many countries are left
keep if year >= 2005
codebook countryname

**************************************
*** Get colors for graphs
*** Note: 1 = bold colors, 2 = light colors
**************************************
preserve
import excel using "Input Data/Graph Colors.xlsm", clear firstrow
*** For each channel (r, b, g) and each shade (1, 2), walk down the first seven rows of the
*** matching spreadsheet column and append every value, followed by a space, to a local
*** macro carrying the same name as the column (r1, r2, b1, b2, g1, g2)
foreach channel in "r" "b" "g" {
	forvalues shade = 1/2 {
		forvalues row = 1/7 {
			local rgb_value = `channel'`shade'[`row']
			local `channel'`shade' = "``channel'`shade''" + "`rgb_value' "
		}
	}
}
restore

**************************************
*** Create cleaner country names for graphs
**************************************
*** Keep an untouched copy of the original country name
clonevar countryname_orig=countryname

*** Shorten " and " to " & "
replace countryname = subinstr(countryname, " and ", " & ", .)

*** Cut names at the first comma (keep only the part before it), except for names
*** whose first part is "Congo", which are left as they are.
*** The comma-separated pieces go into helper variables with their own stub so that all of
*** them can be removed with a wildcard afterwards.
split countryname, parse(",") generate(name_part)
replace countryname = name_part1 if name_part2 != "" & name_part1 != "Congo"

*** Remove every helper piece. The earlier code listed the first piece twice in the drop
*** and therefore left the second piece behind in the data.
drop name_part*

*** Any name containing "Russia" becomes plain "Russia"
replace countryname = "Russia" if strpos(countryname, "Russia")

**************************************
*** Graph raw and adjusted ginis for each country
**************************************

*** Calculate the difference between the raw and pareto adjusted ginis
gen gini_diff = gini_adj - gini

*** Split data into regions: separate creates one copy of each variable per value of region
*** (referred to below as gini1-gini7 and gini_diff1-gini_diff7)
foreach gini in gini gini_diff {
	separate `gini', by(region)
}

*** Create code for graph & legend & colors
*** Three option strings are assembled piece by piece: legend labels (lbl), the list of
*** variables to plot (graph_vars) and the per-bar colors (graph_colors).
*** NOTE(review): the loop bound of 7 and the legend orders further down hard-code seven regions - confirm against the data.
levelsof region, local(regions)
local lbl = ""
local graph_vars = ""
local graph_colors = ""
forvalues i = 1/7 {
	
	*** Each region contributes two stacked bars: an odd-numbered one for the raw gini
	*** and the following even-numbered one for the change in gini
	local bar_num1 = 2*`i'-1
	local bar_num2 = 2*`i'
	
	*** Label: only the odd-numbered (raw gini) bar of each region gets the region name
	local region: word `i' of `regions'
	local lbl = `"`lbl'label(`bar_num1' "`region'") "'

	*** Graph variables
	local graph_vars = "`graph_vars'gini`i' gini_diff`i' "
	
	*** Graph colors: shade 1 colors the raw gini bar, shade 2 the change bar.
	*** The r/g/b lists are the space-separated local macros built from the color spreadsheet earlier.
	forvalues j = 1/2 {
		local red`j': word `i' of `r`j''
		local green`j': word `i' of `g`j''
		local blue`j': word `i' of `b`j''
		local graph_colors = `"`graph_colors'bar(`bar_num`j'', color("`red`j'' `green`j'' `blue`j''")) "'
	}
	

}
*** Stacked bar chart, one bar per country, countries sorted by raw gini in descending order.
*** NOTE(review): legend() appears twice with different order() lists; presumably the later one takes precedence - confirm.
graph bar `graph_vars', ///
	legend(`lbl' order(1 3 5 7 9 11 13) col(4)) `graph_colors' ///
	over(countryname, sort(gini) descending label(angle(90) labsize(vsmall) labgap(*.5))) stack nofill ///
	yscale(noline) ylabel(, tlength(0)) legend(row(1) order(1 5 11 9 7 13 3) symxsize(1)) ///
	title("Ginis Before & After Adjustment for Missing Top Incomes") ///
	note("Authors' calculations based on survey data from PovcalNet and the Luxembourg Income Study and national accounts data from the World Development Indicators.", size(vsmall))
*** Redraw at 14 x 6 and save as PDF
graph display, xsize(14) ysize(6)
graph export "Graphs/Figure 1.pdf", replace

*****************************************************************
*****************************************************************
*** b. Figure 2: Average Gini Across Countries, Adjusted for Missing Top Incomes
*****************************************************************
*****************************************************************

**************************************
*** Read in the data
**************************************
use "Output Data/Raw and Pareto-Adjusted Ginis.dta", clear

**************************************
*** Get colors for graphs
*** (rows 1 and 2 of spreadsheet columns r1, b1 and g1 are stored in the local macros r1, r2, b1, b2, g1, g2)
**************************************
preserve
import excel using "Input Data/Graph Colors.xlsm", clear firstrow
foreach channel in "r" "b" "g" {
	forvalues row = 1/2 {
		local `channel'`row' = `channel'1[`row']
	}
}
restore

**************************************
*** Assign surveys to benchmark years
**************************************

*** Mark the survey closest to each benchmark year (five-year intervals from 1993-2013), separately by survey type for each country
gen benchmark_year = .
forvalues year = 1993(5)2013 {

	gen diff = abs(`year'-year) if gini_adj < . // Only cases with non-missing pareto adjustment
	bys countryname surveytype: egen min_diff = min(diff)
	*** The closest survey qualifies only when it lies within two years of the benchmark year
	replace benchmark_year = `year' if diff == min_diff & diff <= 2

	*** Pick the more recent survey if two surveys are equally close to the benchmark year (that is, before and after the benchmark year)
	bys countryname surveytype: egen num_selected = total(benchmark_year == `year')
	replace benchmark_year = . if num_selected >= 2 & num_selected < . & benchmark_year == `year' & year - `year' < 0

	drop diff min_diff num_selected

}

*** Use income or consumption surveys, whichever we have more of (for maximum consistency)
*** num_selected counts the benchmark years covered by each survey type within a country.
*** The type covering more benchmark years is preferred; when both types cover the same number, consumption ("C") is preferred.
bys countryname surveytype: egen num_selected = total(benchmark_year < .)
bys countryname: egen max_num_selected = max(num_selected)
bys countryname: egen min_num_selected = min(num_selected)
gen benchmark_year2 = benchmark_year if num_selected == max_num_selected & min_num_selected != max_num_selected
replace benchmark_year2 = benchmark_year if surveytype == "C" & min_num_selected == max_num_selected

*** If no survey of the preferred type (income or consumption) was identified for any given benchmark year, use a survey of the other type
forvalues year = 1993(5)2013 {
	gen has`year'_temp = (benchmark_year2 == `year')
	bys countryname: egen has`year' = max(has`year'_temp)
	replace benchmark_year2 = benchmark_year if benchmark_year == `year' & has`year' == 0
	drop has`year'*
}

*** Adopt the preferred-type selection as the benchmark year. Without this step the selection
*** built above is discarded unused, and a country with both an income and a consumption
*** survey near a benchmark year keeps both of them.
*** NOTE(review): this changes which surveys enter Figure 2 - confirm it matches the intended sample.
replace benchmark_year = benchmark_year2

drop num_selected max_num_selected min_num_selected benchmark_year2

**************************************
*** Merge in population for the benchmark year
**************************************

*** Build a country-by-year population file keyed on benchmark_year.
*** Only rows with urbrur == 3 are kept; the label listing printed below shows what that code stands for.
preserve
use "Input Data/Population Data.dta", clear
label list urbrur
keep if urbrur == 3
rename (year pop_urbrur) (benchmark_year pop_benchmark_yr)
keep countryname benchmark_year pop_benchmark_yr
*** Save under the exact tempfile name. Appending ".dta" to it creates a different file,
*** which Stata does not erase automatically when the do-file ends.
tempfile pop_benchmark_yr
save `pop_benchmark_yr', replace
restore

*** keep(3) retains only surveys that found a population row for their country and benchmark year
merge m:1 countryname benchmark_year using `pop_benchmark_yr', keep(3) nogen norep

**************************************
*** Define the balanced sample - countries for which we have surveys of the same survey type (income or consumption) for all five benchmark years
**************************************
bys countryname surveytype: egen num_selected = total(benchmark_year < .)
gen balanced = (num_selected == 5)

**************************************
*** Calculate mean global gini in each benchmark year four ways:
***	- Full sample, unweighted by population
***	- Balanced sample, unweighted by population
*** - Full sample, weighted by population
*** - Balanced sample, weighted by population
**************************************

**** Unweighted
*** Unweighted means of the adjusted gini per benchmark year, for the full and the balanced sample.
*** gini_adj_bal is missing outside the balanced sample, so its mean covers balanced countries only.
preserve
gen gini_adj_bal = gini_adj if balanced == 1
collapse (mean) unweighted=gini_adj unweighted_bal=gini_adj_bal, by(benchmark_year)
label variable unweighted "Unweighted"
label variable unweighted_bal "Unweighted, Balanced"
*** Tempfiles are saved and read under their exact names. Appending ".dta" creates a
*** different file, which Stata does not erase automatically when the do-file ends.
tempfile unweighted
save `unweighted', replace
restore

*** Weighted
*** Same two means, weighted by population in the benchmark year.
*** No weight type is named, so collapse applies its default weight type.
preserve
gen gini_adj_bal = gini_adj if balanced == 1
collapse (mean) weighted=gini_adj weighted_bal=gini_adj_bal [weight=pop_benchmark_yr], by(benchmark_year)
label variable weighted "Population Weighted"
label variable weighted_bal "Population Weighted, Balanced"
tempfile weighted
save `weighted', replace
restore

**************************************
*** Combine and graph the results
**************************************
*** The option of use that discards the data in memory is clear (replace is not an option of use)
use `unweighted', clear
*** assert(3) stops the run unless both files contain exactly the same benchmark years
merge 1:1 benchmark_year using `weighted', assert(3) nogen norep

*** Four lines: weighted and unweighted in the two colors read from the spreadsheet,
*** with the balanced-sample versions drawn dashed in the matching color
twoway (line weighted benchmark_year, lcolor("`r1' `g1' `b1'")) ///
	(line unweighted benchmark_year, lcolor("`r2' `g2' `b2'")) ///
	(line weighted_bal benchmark_year, lcolor("`r1' `g1' `b1'") lpattern(dash) lwidth(medthick) ) ///
	(line unweighted_bal benchmark_year, lcolor("`r2' `g2' `b2'") lpattern(dash) lwidth(medthick) ) ///
	, ylabel(.3(.1).6, tlength(zero)) ytick(.3(.1).6, noticks grid) yscale(range(.3(.1).6) noline) xlabel(1993(5)2013) xtitle("") legend(order(2 1 4 3) col(2))  ///
	title("Average Gini Across Countries, Adjusted for Missing Top Incomes") plotregion(margin(zero)) ///
	note("Authors' calculations based on survey data from PovcalNet and the Luxembourg Income Study, national accounts data from the World Development" "Indicators, and population data from the World Development Indicators.", size(vsmall))
graph display, xsize(8) ysize(5)

*** Save the final graph
graph export  "Graphs/Figure 2.pdf", replace

********************************************************************************************
********************************************************************************************
********************************************************************************************
*** 3. Produce figures for technical appendix
***		a. Figure A2: Sample Lorenz Curves, Before & After the Adjustment
***		b. Figure A3: Change in Gini Coefficient After Adjustment
***		c. Figure A4: Missing Top Incomes as a Share of Total Population
********************************************************************************************
********************************************************************************************
********************************************************************************************

*****************************************************************
*****************************************************************
*** a. Figure A2: Sample Lorenz Curves, Before & After the Adjustment
*****************************************************************
*****************************************************************

**************************************
*** Read in the raw and adjusted P's and L's
**************************************
use "Output Data/Raw and Pareto-Adjusted P's and L's.dta", clear

**************************************
*** Graph a sample distribution
**************************************
*** Number every distribution that has a non-missing alpha, then plot the one numbered 3:
*** raw curve (l against p) and adjusted curve (l_adj against p_adj)
egen id = group(countryname year urbrur surveytype) if !missing(alpha)
label variable l "Survey Values"
label variable l_adj "Survey Values Adjusted for Missing Top Incomes"
twoway (line l p if id == 3) ///
	(line l_adj p_adj if id == 3), ///
	title("Sample Lorenz Curves, Before & After the Adjustment", size(medium)) ///
	xtitle("Percent of Population") ytitle("Percent of Income") ///
	yscale(alt) ylabel(0(.2)1, nogrid) ytick(0(.2)1, nogrid) ///
	plotregion(margin(tiny)) legend(col(2) symxsize(2))
graph display, xsize(7) ysize(7)
graph export "Graphs/Figure A2.pdf", replace

*****************************************************************
*****************************************************************
*** b. Figure A3: Change in Gini Coefficient After Adjustment
*****************************************************************
*****************************************************************

**************************************
*** Read in the data
**************************************
use "Output Data/Raw and Pareto-Adjusted Ginis.dta", clear

**************************************
*** Build the scatter plot (restricted to surveys with updated == 1, i.e. those for which an adjustment was made)
**************************************

*** Adjusted gini minus raw gini
generate gini_diff = gini_adj - gini

*** Survey mean divided by national accounts mean
generate ratio = survey_mean/na_mean

scatter gini_diff ratio if updated == 1, msize(vsmall) ///
	title("Change in Gini Coefficient After Adjustment") ///
	xtitle("Percent of National Accounts Income Captured by Survey") ///
	ytitle("Absolute Change in Gini Coefficient")
graph export "Graphs/Figure A3.pdf", replace

*****************************************************************
*****************************************************************
*** c. Figure A4: Missing Top Incomes as a Share of Total Population
*****************************************************************
*****************************************************************

**************************************
*** Create the graph (including only surveys for which we made an adjustment)
**************************************

*** Calculate the percent of the population missing from the survey
generate missing_pct = 1-survey_pct

*** Plotted against the survey-to-national-accounts ratio that is already in memory from the previous figure
scatter missing_pct ratio if updated == 1, msize(vsmall) ///
	title("Percent of Population Missing from Survey") ///
	xtitle("Percent of National Accounts Income Captured by Survey") ///
	ytitle("Missing Top Incomes as a Share of Total Adjusted Population")
graph export "Graphs/Figure A4.pdf", replace
